{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pandas批量拆分与合并Excel文件"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "实例演示：\n",
    "1. 将一个大Excel等份拆成多个Excel\n",
    "2. 将多个小Excel合并成一个大Excel并标记来源"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "work_dir='../datas/split_to_merge'\n",
    "split_dir=f\"{work_dir}/split\"\n",
    "\n",
    "import os \n",
    "if not os.path.exists(split_dir):\n",
    "    os.mkdir(split_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 0. 读取源Excel到Pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_source = pd.read_excel(f\"{work_dir}/data_level.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>ext_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>11</td>\n",
       "      <td>北京</td>\n",
       "      <td>北京市</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1101</td>\n",
       "      <td>北京</td>\n",
       "      <td>北京市</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>110101</td>\n",
       "      <td>东城</td>\n",
       "      <td>东城区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>110102</td>\n",
       "      <td>西城</td>\n",
       "      <td>西城区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>110105</td>\n",
       "      <td>朝阳</td>\n",
       "      <td>朝阳区</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id name ext_name\n",
       "0      11   北京      北京市\n",
       "1    1101   北京      北京市\n",
       "2  110101   东城      东城区\n",
       "3  110102   西城      西城区\n",
       "4  110105   朝阳      朝阳区"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_source.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RangeIndex(start=0, stop=3637, step=1)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_source.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3637, 3)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_source.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3637"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "total_row_count = df_source.shape[0]\n",
    "total_row_count"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 一. 将一个大Excel等份拆成多个Excel\n",
    "1. 使用df.iloc方法，将一个大的dataframe，拆分成多个小dataframe\n",
    "2. 将使用dataframe.to_excel保存到每个小Excel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 计算拆分后的每个Excel的行数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将这个大的excel,拆分给这几个人\n",
    "user_names = [\"xiao_shuai\",\"xiao_wang\",\"xiao_ming\",\"xiao_lei\",\"xiao_bo\",\"xiao_hong\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "607"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 每个人的任务数目\n",
    "split_size = total_row_count // len(user_names)\n",
    "if total_row_count % len(user_names) != 0:\n",
    "    split_size += 1\n",
    "    \n",
    "split_size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 拆分成多个dataframe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "df_subs = []\n",
    "for idx,user_name in enumerate(user_names):\n",
    "    # iloc的开始索引\n",
    "    begin = idx*split_size\n",
    "    # iloc的结束索引\n",
    "    end = begin+split_size\n",
    "    # 实现df按照iloc划分\n",
    "    df_sub = df_source.iloc[begin:end]\n",
    "    #将每个子df存入列表\n",
    "    df_subs.append((idx,user_name,df_sub))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. 将每个dataframe存入excel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "for idx,user_name,df_sub in df_subs:\n",
    "    file_name = f\"{split_dir}/data_level_{idx}_{user_name}.xlsx\"\n",
    "    df_sub.to_excel(file_name,index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 二. 合并多个小Excel到一个大Excel\n",
    "1. 遍历文件夹，得到要合并的Excel文件列表;\n",
    "2. 分别读取到dataframe，给每个df添加一列用于标记来源;\n",
    "3. 使用pd.concat进行df批量合并;\n",
    "4. 将合并后的dataframe输出到excel;"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.遍历文件夹，得到要合并的Excel文件列表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['data_level_0_xiao_shuai.xlsx',\n",
       " 'data_level_1_xiao_wang.xlsx',\n",
       " 'data_level_2_xiao_ming.xlsx',\n",
       " 'data_level_3_xiao_lei.xlsx',\n",
       " 'data_level_4_xiao_bo.xlsx',\n",
       " 'data_level_5_xiao_hong.xlsx']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "excel_names=[]\n",
    "for excel_name in os.listdir(split_dir):\n",
    "    excel_names.append(excel_name)\n",
    "\n",
    "excel_names"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 分别读取到dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data_level_0_xiao_shuai.xlsx xiao_shuai\n",
      "data_level_1_xiao_wang.xlsx xiao_wang\n",
      "data_level_2_xiao_ming.xlsx xiao_ming\n",
      "data_level_3_xiao_lei.xlsx xiao_lei\n",
      "data_level_4_xiao_bo.xlsx xiao_bo\n",
      "data_level_5_xiao_hong.xlsx xiao_hong\n"
     ]
    }
   ],
   "source": [
    "df_list = []\n",
    "\n",
    "for excel_name in excel_names:\n",
    "    # 读取每个excel到df\n",
    "    excel_path=f\"{split_dir}/{excel_name}\"\n",
    "    df_split = pd.read_excel(excel_path)\n",
    "    # 得到username\n",
    "    username = excel_name.replace(\"data_level_\",\"\").replace(\".xlsx\",\"\")[2:]\n",
    "    print(excel_name,username)\n",
    "    # 给每个df添加1列，即用户名字\n",
    "    df_split[\"username\"] = username\n",
    "    \n",
    "    df_list.append(df_split)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. 使用pd.concat进行合并"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_merged = pd.concat(df_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3637, 4)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_merged.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>ext_name</th>\n",
       "      <th>username</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>11</td>\n",
       "      <td>北京</td>\n",
       "      <td>北京市</td>\n",
       "      <td>xiao_shuai</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1101</td>\n",
       "      <td>北京</td>\n",
       "      <td>北京市</td>\n",
       "      <td>xiao_shuai</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>110101</td>\n",
       "      <td>东城</td>\n",
       "      <td>东城区</td>\n",
       "      <td>xiao_shuai</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>110102</td>\n",
       "      <td>西城</td>\n",
       "      <td>西城区</td>\n",
       "      <td>xiao_shuai</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>110105</td>\n",
       "      <td>朝阳</td>\n",
       "      <td>朝阳区</td>\n",
       "      <td>xiao_shuai</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id name ext_name    username\n",
       "0      11   北京      北京市  xiao_shuai\n",
       "1    1101   北京      北京市  xiao_shuai\n",
       "2  110101   东城      东城区  xiao_shuai\n",
       "3  110102   西城      西城区  xiao_shuai\n",
       "4  110105   朝阳      朝阳区  xiao_shuai"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_merged.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. 将合并后的dataframe输出到excel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_merged.to_excel(f\"{work_dir}/data_level_merged.xlsx\",index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
